data <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data", header=TRUE, sep=",")
summary(data)
##  M            X0.455          X0.365           X0.095      
##  F:1307   Min.   :0.075   Min.   :0.0550   Min.   :0.0000  
##  I:1342   1st Qu.:0.450   1st Qu.:0.3500   1st Qu.:0.1150  
##  M:1527   Median :0.545   Median :0.4250   Median :0.1400  
##           Mean   :0.524   Mean   :0.4079   Mean   :0.1395  
##           3rd Qu.:0.615   3rd Qu.:0.4800   3rd Qu.:0.1650  
##           Max.   :0.815   Max.   :0.6500   Max.   :1.1300  
##      X0.514          X0.2245           X0.101            X0.15       
##  Min.   :0.0020   Min.   :0.0010   Min.   :0.00050   Min.   :0.0015  
##  1st Qu.:0.4415   1st Qu.:0.1860   1st Qu.:0.09337   1st Qu.:0.1300  
##  Median :0.7997   Median :0.3360   Median :0.17100   Median :0.2340  
##  Mean   :0.8288   Mean   :0.3594   Mean   :0.18061   Mean   :0.2389  
##  3rd Qu.:1.1533   3rd Qu.:0.5020   3rd Qu.:0.25300   3rd Qu.:0.3290  
##  Max.   :2.8255   Max.   :1.4880   Max.   :0.76000   Max.   :1.0050  
##       X15        
##  Min.   : 1.000  
##  1st Qu.: 8.000  
##  Median : 9.000  
##  Mean   : 9.932  
##  3rd Qu.:11.000  
##  Max.   :29.000
colnames(data)
## [1] "M"       "X0.455"  "X0.365"  "X0.095"  "X0.514"  "X0.2245" "X0.101" 
## [8] "X0.15"   "X15"
colnames(data) <- c("sex", "length", "diameter", "height", 
                "whole_weight", "shucked_weight",
                "viscera_weight", "shell_weight", "rings")

colnames(data)
## [1] "sex"            "length"         "diameter"       "height"        
## [5] "whole_weight"   "shucked_weight" "viscera_weight" "shell_weight"  
## [9] "rings"
data$sex <- factor(c("Female", "Infant", "Male")[data$sex])
par(mfrow=c(1,3)) 
hist(data$diameter, main = "Диаметр, мм")
hist(data$height, main = "Высота, мм")
hist(data$whole_weight, main = "Полный вес, гр")

Видим ассиметрию https://en.wikipedia.org/wiki/Skewness и выбросы (от них нужно избавиться)

Визуализация возможных зависимостей

par(mfrow=c(1,2)) 
plot(data$diameter, data$whole_weight,'p',main = "Зависимость веса от диаметра")
plot(data$height, data$whole_weight,'p',main = "Зависимость веса от высоты")

Исследование увиденой зависимости 1.Построить линейные модели при помощи функции lm, посмотреть их характеристики

Линейная модель зависимости веса от диаметра

diameter_linear_model <- lm(whole_weight~diameter, data)
diameter_linear_model
## 
## Call:
## lm(formula = whole_weight ~ diameter, data = data)
## 
## Coefficients:
## (Intercept)     diameter  
##      -1.036        4.573
summary(diameter_linear_model)
## 
## Call:
## lm(formula = whole_weight ~ diameter, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.56747 -0.12310 -0.03997  0.07211  1.14104 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.03645    0.01216   -85.2   <2e-16 ***
## diameter     4.57295    0.02898   157.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1858 on 4174 degrees of freedom
## Multiple R-squared:  0.8565, Adjusted R-squared:  0.8564 
## F-statistic: 2.491e+04 on 1 and 4174 DF,  p-value: < 2.2e-16
plot(diameter_linear_model)

Линейная модель веса от высоты

height_linear_model <- lm(whole_weight~height, data)
height_linear_model
## 
## Call:
## lm(formula = whole_weight ~ height, data = data)
## 
## Coefficients:
## (Intercept)       height  
##     -0.5114       9.6054
summary(height_linear_model)
## 
## Call:
## lm(formula = whole_weight ~ height, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.7487 -0.1488 -0.0346  0.1151  1.5238 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.51140    0.01516  -33.73   <2e-16 ***
## height       9.60540    0.10408   92.29   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2813 on 4174 degrees of freedom
## Multiple R-squared:  0.6711, Adjusted R-squared:  0.671 
## F-statistic:  8517 on 1 and 4174 DF,  p-value: < 2.2e-16
plot(height_linear_model)

  1. Устранение выбросов и построение новых моделей
    Линейная модель зависимости веса от диаметра без выбросов
data_without_emissions <- data[data$diameter>0.1, ]
diameter_linear_model_new <- lm(whole_weight~diameter, data_without_emissions)
diameter_linear_model_new
## 
## Call:
## lm(formula = whole_weight ~ diameter, data = data_without_emissions)
## 
## Coefficients:
## (Intercept)     diameter  
##      -1.048        4.598
summary(diameter_linear_model_new)
## 
## Call:
## lm(formula = whole_weight ~ diameter, data = data_without_emissions)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.56815 -0.12225 -0.03874  0.07345  1.13705 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.04765    0.01216  -86.12   <2e-16 ***
## diameter     4.59849    0.02896  158.78   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1846 on 4169 degrees of freedom
## Multiple R-squared:  0.8581, Adjusted R-squared:  0.8581 
## F-statistic: 2.521e+04 on 1 and 4169 DF,  p-value: < 2.2e-16
plot(diameter_linear_model_new)

Линейная модель зависимости веса от высоты без выбросов

data_without_emissions <- data[data$height<0.4&data$height>0.05&data$diameter>0.1, ]
dw_linear_model <- lm(whole_weight~+height+diameter, data_without_emissions)
dw_linear_model
## 
## Call:
## lm(formula = whole_weight ~ +height + diameter, data = data_without_emissions)
## 
## Coefficients:
## (Intercept)       height     diameter  
##      -1.120        3.763        3.473
summary(dw_linear_model)
## 
## Call:
## lm(formula = whole_weight ~ +height + diameter, data = data_without_emissions)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.52231 -0.10868 -0.03049  0.07438  1.01366 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.12005    0.01168  -95.91   <2e-16 ***
## height       3.76302    0.16194   23.24   <2e-16 ***
## diameter     3.47294    0.06292   55.20   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1664 on 4105 degrees of freedom
## Multiple R-squared:  0.8817, Adjusted R-squared:  0.8817 
## F-statistic: 1.53e+04 on 2 and 4105 DF,  p-value: < 2.2e-16
plot(dw_linear_model)

all_in_one_linear_model <- lm(whole_weight~.-shucked_weight-viscera_weight-shell_weight, data_without_emissions)
all_in_one_linear_model
## 
## Call:
## lm(formula = whole_weight ~ . - shucked_weight - viscera_weight - 
##     shell_weight, data = data_without_emissions)
## 
## Coefficients:
## (Intercept)    sexInfant      sexMale       length     diameter  
##   -1.157326    -0.021696     0.015360     1.911435     1.229664  
##      height        rings  
##    3.580197    -0.002294
summary(all_in_one_linear_model)
## 
## Call:
## lm(formula = whole_weight ~ . - shucked_weight - viscera_weight - 
##     shell_weight, data = data_without_emissions)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.46840 -0.10704 -0.03456  0.06938  1.05602 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.1573263  0.0167308 -69.174  < 2e-16 ***
## sexInfant   -0.0216956  0.0075909  -2.858  0.00428 ** 
## sexMale      0.0153597  0.0061246   2.508  0.01219 *  
## length       1.9114347  0.1307500  14.619  < 2e-16 ***
## diameter     1.2296643  0.1636835   7.512 7.08e-14 ***
## height       3.5801973  0.1647054  21.737  < 2e-16 ***
## rings       -0.0022938  0.0009993  -2.295  0.02176 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1617 on 4101 degrees of freedom
## Multiple R-squared:  0.8885, Adjusted R-squared:  0.8883 
## F-statistic:  5446 on 6 and 4101 DF,  p-value: < 2.2e-16
plot(all_in_one_linear_model)

  1. Разделить массив на 2 случайные части и подогнать модель по первой
data.noout <- data_without_emissions
odds <- seq(1, nrow(data.noout), by=2)
data.in <- data.noout[odds, ]
data.out <- data.noout[-odds, ]

linear.model.half<-lm(whole_weight~.-shucked_weight-viscera_weight-shell_weight,data.in)
linear.model.half
## 
## Call:
## lm(formula = whole_weight ~ . - shucked_weight - viscera_weight - 
##     shell_weight, data = data.in)
## 
## Coefficients:
## (Intercept)    sexInfant      sexMale       length     diameter  
##   -1.158210    -0.024309     0.024274     1.899176     1.165536  
##      height        rings  
##    3.812492    -0.001947
summary(linear.model.half)
## 
## Call:
## lm(formula = whole_weight ~ . - shucked_weight - viscera_weight - 
##     shell_weight, data = data.in)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.44573 -0.10895 -0.03478  0.07045  1.03577 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.158210   0.023804 -48.656  < 2e-16 ***
## sexInfant   -0.024309   0.010828  -2.245  0.02488 *  
## sexMale      0.024274   0.008793   2.761  0.00582 ** 
## length       1.899176   0.180661  10.512  < 2e-16 ***
## diameter     1.165536   0.227570   5.122 3.31e-07 ***
## height       3.812492   0.239939  15.889  < 2e-16 ***
## rings       -0.001947   0.001461  -1.332  0.18288    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1639 on 2047 degrees of freedom
## Multiple R-squared:  0.8888, Adjusted R-squared:  0.8885 
## F-statistic:  2727 on 6 and 2047 DF,  p-value: < 2.2e-16
plot(linear.model.half)

Cпрогнозировать (функция predict) значения во второй части

data.predict <- predict(linear.model.half, data.out)
plot (data.out$whole_weight, data.predict)

Проверка качества прогноза

cor (data.out$whole_weight, data.predict)
## [1] 0.9424124